{
  "cells": [
    {
      "cell_type": "code",
      "execution_count": 1,
      "metadata": {},
      "outputs": [
        {
          "name": "stderr",
          "output_type": "stream",
          "text": [
            "d:\\softwares\\python\\lib\\site-packages\\umap\\distances.py:1063: NumbaDeprecationWarning: \u001b[1mThe 'nopython' keyword argument was not supplied to the 'numba.jit' decorator. The implicit default value for this argument is currently False, but it will be changed to True in Numba 0.59.0. See https://numba.readthedocs.io/en/stable/reference/deprecation.html#deprecation-of-object-mode-fall-back-behaviour-when-using-jit for details.\u001b[0m\n",
            "  @numba.jit()\n",
            "d:\\softwares\\python\\lib\\site-packages\\umap\\distances.py:1071: NumbaDeprecationWarning: \u001b[1mThe 'nopython' keyword argument was not supplied to the 'numba.jit' decorator. The implicit default value for this argument is currently False, but it will be changed to True in Numba 0.59.0. See https://numba.readthedocs.io/en/stable/reference/deprecation.html#deprecation-of-object-mode-fall-back-behaviour-when-using-jit for details.\u001b[0m\n",
            "  @numba.jit()\n",
            "d:\\softwares\\python\\lib\\site-packages\\umap\\distances.py:1086: NumbaDeprecationWarning: \u001b[1mThe 'nopython' keyword argument was not supplied to the 'numba.jit' decorator. The implicit default value for this argument is currently False, but it will be changed to True in Numba 0.59.0. See https://numba.readthedocs.io/en/stable/reference/deprecation.html#deprecation-of-object-mode-fall-back-behaviour-when-using-jit for details.\u001b[0m\n",
            "  @numba.jit()\n",
            "d:\\softwares\\python\\lib\\site-packages\\tqdm\\auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
            "  from .autonotebook import tqdm as notebook_tqdm\n",
            "d:\\softwares\\python\\lib\\site-packages\\umap\\umap_.py:660: NumbaDeprecationWarning: \u001b[1mThe 'nopython' keyword argument was not supplied to the 'numba.jit' decorator. The implicit default value for this argument is currently False, but it will be changed to True in Numba 0.59.0. See https://numba.readthedocs.io/en/stable/reference/deprecation.html#deprecation-of-object-mode-fall-back-behaviour-when-using-jit for details.\u001b[0m\n",
            "  @numba.jit()\n"
          ]
        }
      ],
      "source": [
        "import numpy as np\n",
        "from bertopic import BERTopic\n",
        "from sklearn.feature_extraction.text import CountVectorizer\n",
        "from transformers.pipelines import pipeline\n",
        "from umap import UMAP\n",
        "from sklearn.cluster import KMeans # 导入Kmeans"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "# 加载数据"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 2,
      "metadata": {},
      "outputs": [
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "条数:  1000\n",
            "预览第一条:  文旅文 创看 洛阳 河南省 文旅文创 发展 大会 本次 大会 安排 项目 签约 主要 方面 内容 一是 文旅 产业 项目 签约 截至 目前 梳理 重点 文旅 项目 投资总额 525.6 亿元 遴选 重大项目 进行 现场 签约 投资总额 365.8 亿元 项目 包括 文物 数字化 开发 文化 创意 园区 建设 文化 项目 涵盖 旅游 度假区 建设 旅游 酒店 民宿 打造 旅游 项目 既有 旅游 景区 开发 商旅 综合体 建设 传统 业态 项目 宇宙 基地 沉浸 演艺 业态 项目 充分体现 我省 文化 旅游 发展 特点 趋势 二是 引客 入豫 项目 签约 主要 我省 文旅 部门 文旅 企业 头部 旅行 知名 OTA 平台 重点 客源地 文旅 部门 签订 引客 入豫 协议 持续 拓展 省外 客源 市场\n",
            "\n"
          ]
        }
      ],
      "source": [
        "# step1 加载文件\n",
        "with open('../../data/切词.txt', 'r', encoding='utf-8') as file:\n",
        "  docs = file.readlines()\n",
        "print('条数: ', len(docs))\n",
        "print('预览第一条: ', docs[0])\n",
        "\n",
        "vectorizer_model = None"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {},
      "source": [
        "# 创建"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 3,
      "metadata": {},
      "outputs": [
        {
          "name": "stderr",
          "output_type": "stream",
          "text": [
            "Some weights of the model checkpoint at bert-base-chinese were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias']\n",
            "- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
            "- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n"
          ]
        },
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "(1000, 768)\n"
          ]
        }
      ],
      "source": [
        "# 1. 词向量模型，同时加载本地训练好的词向量\n",
        "embedding_model = pipeline(\"feature-extraction\", model=\"bert-base-chinese\") # 使用bert-base-chinese\n",
        "embeddings = np.load('../../data/embedding_bbc.npy') # 使用bert-base-chinese向量\n",
        "print(embeddings.shape)\n",
        "\n",
        "# 2. 创建分词模型\n",
        "vectorizer_model = CountVectorizer() # 因为我们已经分好词了，所以这里不需要传入分词函数了\n",
        "\n",
        "# 3. 创建UMAP降维模型\n",
        "umap_model = UMAP(\n",
        "  n_neighbors=15,\n",
        "  n_components=5,\n",
        "  min_dist=0.0,\n",
        "  metric='cosine',\n",
        "  random_state=42  # ⚠️ 防止随机 https://maartengr.github.io/BERTopic/faq.html\n",
        ")\n",
        "\n",
        "# 4. 创建HDBSCAN聚类模型\n",
        "# 如果要建设离群值，可以减小下面两个参数\n",
        "# https://hdbscan.readthedocs.io/en/latest/faq.html\n",
        "\n",
        "# 4. 创建Kmeans模型\n",
        "cluster_model = KMeans(n_clusters=6) # 要聚成几个类，此处为随便填写\n",
        "\n",
        "# 5. 创建CountVectorizer模型\n",
        "vectorizer_model = CountVectorizer(stop_words=['洛阳', '旅游', '文化'])"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 4,
      "metadata": {},
      "outputs": [
        {
          "data": {
            "text/html": [
              "<div>\n",
              "<style scoped>\n",
              "    .dataframe tbody tr th:only-of-type {\n",
              "        vertical-align: middle;\n",
              "    }\n",
              "\n",
              "    .dataframe tbody tr th {\n",
              "        vertical-align: top;\n",
              "    }\n",
              "\n",
              "    .dataframe thead th {\n",
              "        text-align: right;\n",
              "    }\n",
              "</style>\n",
              "<table border=\"1\" class=\"dataframe\">\n",
              "  <thead>\n",
              "    <tr style=\"text-align: right;\">\n",
              "      <th></th>\n",
              "      <th>Topic</th>\n",
              "      <th>Count</th>\n",
              "      <th>Name</th>\n",
              "      <th>Representation</th>\n",
              "      <th>Representative_Docs</th>\n",
              "    </tr>\n",
              "  </thead>\n",
              "  <tbody>\n",
              "    <tr>\n",
              "      <th>0</th>\n",
              "      <td>0</td>\n",
              "      <td>253</td>\n",
              "      <td>0_发展_建设_城市_项目</td>\n",
              "      <td>[发展, 建设, 城市, 项目, 活动, 河南, 景区, 游客, 洛阳市, 中国]</td>\n",
              "      <td>[行走 河南 读懂 中国 关注 全省 文旅文创 发展 大会 二十大 报告 指出 坚持 以文塑...</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>1</th>\n",
              "      <td>1</td>\n",
              "      <td>210</td>\n",
              "      <td>1_河南_中国_旅行_历史</td>\n",
              "      <td>[河南, 中国, 旅行, 历史, 景区, 博物馆, 位于, 城市, 王府, 竹海]</td>\n",
              "      <td>[河南省 简称 河南省 历史 大部分 位于 黄河 以南 河南 远古 时期 黄河 中下游 地区...</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>2</th>\n",
              "      <td>2</td>\n",
              "      <td>198</td>\n",
              "      <td>2_景区_活动_河南_免费</td>\n",
              "      <td>[景区, 活动, 河南, 免费, 时间, 门票, 高速, 大峡谷, 白云山, 博物馆]</td>\n",
              "      <td>[河南 多家 景区 陆续 发布 开园 公告 台风 杜苏芮 强度 逐渐 减弱 河南 景区 陆续...</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>3</th>\n",
              "      <td>3</td>\n",
              "      <td>173</td>\n",
              "      <td>3_景区_年票_游客_旅客</td>\n",
              "      <td>[景区, 年票, 游客, 旅客, 郑州, 假期, 疫情, 防控, 城市, 高速]</td>\n",
              "      <td>[日起 河南 景区 门票 河南省政府 新闻办 召开 新闻 发布会 获悉 豫见 春天 惠游 老...</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>4</th>\n",
              "      <td>4</td>\n",
              "      <td>109</td>\n",
              "      <td>4_中国_石窟_龙门石窟_艺术</td>\n",
              "      <td>[中国, 石窟, 龙门石窟, 艺术, 世界, 位于, 莫高窟, 造像, 文化遗产, 朝代]</td>\n",
              "      <td>[龙门石窟 中国 石刻 艺术 宝库 现为 世界 文化遗产 全国 重点 文物保护 单位 国家 ...</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>5</th>\n",
              "      <td>5</td>\n",
              "      <td>57</td>\n",
              "      <td>5_地方_安阳_古城_很多</td>\n",
              "      <td>[地方, 安阳, 古城, 很多, 时间, 美食, 看到, 感受, 一下, 一定]</td>\n",
              "      <td>[超全 安阳 旅游 攻略 安阳 美食 大全 安阳 简称 这座 七朝 古都 盘踞 河南省 北端...</td>\n",
              "    </tr>\n",
              "  </tbody>\n",
              "</table>\n",
              "</div>"
            ],
            "text/plain": [
              "   Topic  Count             Name  \\\n",
              "0      0    253    0_发展_建设_城市_项目   \n",
              "1      1    210    1_河南_中国_旅行_历史   \n",
              "2      2    198    2_景区_活动_河南_免费   \n",
              "3      3    173    3_景区_年票_游客_旅客   \n",
              "4      4    109  4_中国_石窟_龙门石窟_艺术   \n",
              "5      5     57    5_地方_安阳_古城_很多   \n",
              "\n",
              "                                  Representation  \\\n",
              "0      [发展, 建设, 城市, 项目, 活动, 河南, 景区, 游客, 洛阳市, 中国]   \n",
              "1      [河南, 中国, 旅行, 历史, 景区, 博物馆, 位于, 城市, 王府, 竹海]   \n",
              "2    [景区, 活动, 河南, 免费, 时间, 门票, 高速, 大峡谷, 白云山, 博物馆]   \n",
              "3       [景区, 年票, 游客, 旅客, 郑州, 假期, 疫情, 防控, 城市, 高速]   \n",
              "4  [中国, 石窟, 龙门石窟, 艺术, 世界, 位于, 莫高窟, 造像, 文化遗产, 朝代]   \n",
              "5       [地方, 安阳, 古城, 很多, 时间, 美食, 看到, 感受, 一下, 一定]   \n",
              "\n",
              "                                 Representative_Docs  \n",
              "0  [行走 河南 读懂 中国 关注 全省 文旅文创 发展 大会 二十大 报告 指出 坚持 以文塑...  \n",
              "1  [河南省 简称 河南省 历史 大部分 位于 黄河 以南 河南 远古 时期 黄河 中下游 地区...  \n",
              "2  [河南 多家 景区 陆续 发布 开园 公告 台风 杜苏芮 强度 逐渐 减弱 河南 景区 陆续...  \n",
              "3  [日起 河南 景区 门票 河南省政府 新闻办 召开 新闻 发布会 获悉 豫见 春天 惠游 老...  \n",
              "4  [龙门石窟 中国 石刻 艺术 宝库 现为 世界 文化遗产 全国 重点 文物保护 单位 国家 ...  \n",
              "5  [超全 安阳 旅游 攻略 安阳 美食 大全 安阳 简称 这座 七朝 古都 盘踞 河南省 北端...  "
            ]
          },
          "execution_count": 4,
          "metadata": {},
          "output_type": "execute_result"
        }
      ],
      "source": [
        "topic_model = BERTopic(\n",
        "  embedding_model=embedding_model,\n",
        "  vectorizer_model=vectorizer_model,\n",
        "  umap_model=umap_model,\n",
        "  hdbscan_model=cluster_model, # 传入kmeans模型\n",
        ")\n",
        "\n",
        "topics, probs = topic_model.fit_transform(docs, embeddings=embeddings) #传入训练好的词向量\n",
        "topic_info = topic_model.get_topic_info()\n",
        "topic_info"
      ]
    }
  ],
  "metadata": {
    "kernelspec": {
      "display_name": "Python 3",
      "language": "python",
      "name": "python3"
    },
    "language_info": {
      "codemirror_mode": {
        "name": "ipython",
        "version": 3
      },
      "file_extension": ".py",
      "mimetype": "text/x-python",
      "name": "python",
      "nbconvert_exporter": "python",
      "pygments_lexer": "ipython3",
      "version": "3.10.6"
    }
  },
  "nbformat": 4,
  "nbformat_minor": 2
}
